# +~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~ #  
#
#' @title  Sample tweets for round 1 of crowd coding 
#' @author Hauke Licht
#
# +~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~ #

# setup ----
# Fix the RNG seed once for the whole script: kmeans(), sample_n() and
# sample_frac() are called further down and all draw from this RNG stream.
# NOTE(review): the RNG state that reaches the sampling step differs depending
# on whether the k-means fit is recomputed or loaded from disk -- confirm
# whether the tweet sample needs to be reproducible in both cases.
set.seed(1234)

library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)

# source(file.path("R", "utils.R"))

# Project directories; everything is resolved relative to `base_path`.
base_path <- file.path(".")
data_path <- file.path(base_path, "data")
# `data_path` already starts with `base_path`, so it must not be prepended a
# second time (that only resolved correctly while `base_path` was ".").
fits_path <- file.path(data_path, "fits")

# load data ----

## load tweets classified into political/non-political ----

# note: generated in script code/01-preproc/03-classify_tweets_political.R
fp <- file.path(data_path, "input", "all_tweets_classified_political.rds")
# keep only tweets whose `collected_posthoc` flag is "no" and recode a missing
# `is_en` (is an English translation available?) as FALSE
classified_tweets <- mutate(
  filter(read_rds(fp), collected_posthoc == "no"),
  is_en = ifelse(is.na(is_en), FALSE, is_en)
)

## load LASER independent component representations ----

# independent component (IC) representations of the tweet text LASER embeddings
# note: generated in script code/01-preproc/02-reduce_laser_embeddings_dimensionality.R
fp <- file.path(fits_path, "laser_embeddings_ica.rds")
tweet_laser_ics <- read_rds(fp)$S

# turn the matrix into a data frame; its row names are split at "_" into the
# `user_id` and `status_id` key columns
tweet_laser_ics <- tweet_laser_ics %>%
  as_tibble() %>%
  bind_cols(id = rownames(tweet_laser_ics), .) %>%
  separate(id, into = c("user_id", "status_id"), sep = "_")

# attach the ICs to the classified tweets (natural join on all shared columns)
classified_tweets_ics <- classified_tweets %>%
  left_join(tweet_laser_ics)

# the stand-alone IC table is no longer needed; free its memory
rm(tweet_laser_ics)
gc()

# tabulate eligible tweets
classified_tweets %>%
  count(political, is_en)

# load/generate tweet clustering ----

# Clustering population: political tweets with an English (machine) translation.
# Filter once so that fitting and cluster assignment use exactly the same rows.
political_en_tweets <- filter(classified_tweets_ics, is_en, political == "yes")
# "<user_id>_<status_id>" key per row, used to label and verify assignments
tweet_keys <- sprintf("%s_%s", political_en_tweets$user_id, political_en_tweets$status_id)

fp <- file.path(fits_path, "political_en_tweets_clusters.rds")
if (file.exists(fp)) {
  # load k-means cluster assignments of machine-translated political tweets
  k_means500 <- read_rds(fp)
} else {
  # cluster machine-translated political tweets using k-means with k = 500
  st <- Sys.time()
  k_means500 <- political_en_tweets %>%
    select(starts_with("ic")) %>%
    as.matrix() %>%
    kmeans(centers = 500, iter.max = 100L)
  k_means500$runtime <- Sys.time() - st

  if (k_means500$ifault == 4) {
    stop("Could not cluster tweets. Try with different algorithm, e.g., 'Lloyd'.")
  }

  message(sprintf("500 clusters obtained in %d iterations using %s algorithm", k_means500$iter, dQuote("Hartigan-Wong")))

  # label each assignment with its tweet key before caching the fit
  names(k_means500$cluster) <- tweet_keys

  # `write_rds()` comes from readr (attached above); the `save_rds()` helper
  # used previously is not defined by any package this script attaches
  write_rds(k_means500, fp)
}

# Guard against a cached fit that no longer lines up with the current data:
# assignments are matched to tweets purely by row position.
if (length(k_means500$cluster) != nrow(political_en_tweets)) {
  stop(
    "Cluster assignments (", length(k_means500$cluster), ") do not match the ",
    "number of eligible tweets (", nrow(political_en_tweets), "). ",
    "Delete '", fp, "' to re-fit the clustering."
  )
}
if (!is.null(names(k_means500$cluster)) && !identical(names(k_means500$cluster), tweet_keys)) {
  warning("Tweet IDs stored with the cluster assignments differ from the current row order.")
}

# assign tweets to clusters
clustered_tweets <- mutate(political_en_tweets, cluster_id = fitted(k_means500, "classes"))

rm(political_en_tweets, tweet_keys)


# plot cluster sizes
# one row per cluster: its index and the number of tweets assigned to it
cluster_sizes <- tibble(
  cluster_id = seq_len(500L),
  size = as.integer(k_means500$size)
)

# histogram of the number of tweets per cluster
ggplot(cluster_sizes, aes(x = size)) +
  geom_histogram(bins = 50, fill = "black", alpha = .9) +
  labs(
    title = "Distribution of cluster sizes of 500 k-means clusters.",
    subtitle = "Clusters obtained from 300 independent components of tweet LASER embedding representations.",
    x = NULL,
    y = "Count"
  )

# obtain t-SNE cluster centroid ----

fp <- file.path(fits_path, "political_en_tweets_clusters_tsnes.rds")

if (file.exists(fp)) {
  centroid_tsnes <- read_rds(fp)
} else {
  library(future)
  library(furrr)
  library(tsne)

  # 'multisession' (background R sessions) works on every platform, including
  # Windows; forked 'multicore' workers are only an option on Unix-alikes.
  # Keep the previous plan so it can be restored once the workers are done.
  old_plan <- plan(multisession)

  # project cluster centroids to two dimensions using tSNE
  # with different degrees of perplexity
  centroid_tsnes <- future_map_dfr(c(3, 5, 10, 30, 60, 90), function(p) {
    # re-seed inside each worker so every perplexity run starts from the same
    # RNG state regardless of which session executes it
    set.seed(1234)
    pc_tsne <- tsne::tsne(k_means500$centers, k = 2, perplexity = p, whiten = FALSE)
    colnames(pc_tsne) <- paste0("d", 1:2)
    # one row per centroid: perplexity, 2-d coordinates, and cluster size
    bind_cols(perplexity = p, pc_tsne, cluster_size = k_means500$size)
  })

  # shut the background sessions down again
  plan(old_plan)

  # `write_rds()` comes from readr (attached above); the `save_rds()` helper
  # used previously is not defined by any package this script attaches
  write_rds(centroid_tsnes, fp)
}

# plot t-SNE centroids
# scatter of the two t-SNE dimensions, one panel per perplexity value
ggplot(centroid_tsnes, aes(x = d1, y = d2)) +
  geom_point(size = .5, alpha = .25) +
  facet_wrap(~perplexity) +
  labs(
    title = "Two-dimensional t-SNE representation of cluster centroids",
    subtitle = "Facets report results at different perplexity values",
    x = NULL,
    y = NULL
  )

# generate cluster units ----

# Per cluster: number of distinct countries, number of distinct parties, and
# number of tweets. The result has one row per `cluster_id`; the sampling step
# further down pairs these rows positionally with the per-cluster tweet groups.
cluster_unit_sizes <- clustered_tweets %>%
  group_by(cluster_id) %>%
  summarise(
    n_countries = n_distinct(country_iso3c),
    n_parties = n_distinct(party_id),
    cluster_size = n()
  ) %>%
  ungroup()

# plot cluster unit sizes
# box plots of the number of parties per cluster, grouped by the number of
# countries represented in the cluster
ggplot(
  cluster_unit_sizes,
  aes(x = n_countries, y = n_parties, group = n_countries, size = cluster_size)
) +
  geom_boxplot(fill = NA, alpha = .5, show.legend = FALSE) +
  labs(
    title = "Cluster composition: country and party diversity.",
    x = "Number of countries",
    y = "Number of parties"
  )


# determine cluster sample sizes ----

#' Sample size that grows logarithmically with a count
#'
#' @param x Numeric vector of counts.
#' @return Numeric vector of the same length as `x`, `ceiling(log2(x)) + 1`
#'   (so 1 -> 1, 2 -> 2, 3-4 -> 3, 5-8 -> 4, ...).
determine_sample_size <- function(x) {
  doublings <- ceiling(log2(x))
  doublings + 1
}
# per-cluster sample size, derived from the number of distinct parties
s <- determine_sample_size(cluster_unit_sizes[["n_parties"]])

# visualize rule
# number of parties against cluster size, with a smoothed trend line
ggplot(cluster_unit_sizes, aes(x = cluster_size, y = n_parties)) +
  geom_point(size = .5, alpha = .5) +
  geom_smooth(color = "black")

# visualize distribution
# cluster sizes grouped by the sample size assigned to the cluster
ggplot(cluster_unit_sizes, aes(x = s, y = cluster_size, group = s)) +
  geom_boxplot() +
  scale_x_continuous(breaks = seq_len(9L)) +
  coord_flip() +
  labs(
    x = "Sample size",
    y = "Cluster size"
  )

# sample tweets ----

# Draw the round-1 sample once and cache it; later runs reuse the cached file
# so the set of sampled tweets stays fixed.
fp <- file.path(data_path, "intermediate", "samples", "tweets_sample_1.rds")
if (file.exists(fp)) {
  sampled_tweets <- read_rds(fp)
} else {
  # keep the first 13 columns (selected by position) plus the cluster ID,
  # split into one data frame per cluster, and pair each with its sample
  # size in `s` (both are ordered by cluster ID)
  sampled_tweets <- select(clustered_tweets, 1:13, cluster_id) %>% 
    split(.$cluster_id) %>% 
    purrr::map2_dfr(
      .x = .
      , .y = s    
      , function(.x, .y) {
        .x %>% 
          # at most one tweet per user within a cluster ...
          group_by(user_id) %>% 
          sample_n(1) %>% 
          ungroup() %>% 
          # ... then draw the cluster's sample size from those tweets
          sample_n(.y) %>% 
          mutate(cluster_sample_size = .y)
      }
    ) %>% 
    # shuffle the row order of the combined sample
    sample_frac(1)
  
  # make sure the output directory exists before writing
  dir.create(dirname(fp), recursive = TRUE, showWarnings = FALSE)
  # `write_rds()` comes from readr (attached above); the `save_rds()` helper
  # used previously is not defined by any package this script attaches
  write_rds(sampled_tweets, fp)
}

# save clustered tweets ----

# Enrich every clustered tweet with
#  - its cluster's summary statistics (columns prefixed with "cluster_" unless
#    they already carry that prefix), including the cluster's sample size, and
#  - a `sample` marker ("sample 1") for tweets that were drawn into the sample
#    (matched via a natural join on the first five columns of the sample).
clustered_tweets <- clustered_tweets %>%
  left_join(
    cluster_unit_sizes %>%
      mutate(sample_size = s) %>%
      rename_all(~ifelse(grepl("^cluster_", .), ., paste0("cluster_", .))),
    by = "cluster_id"
  ) %>%
  left_join(
    sampled_tweets %>%
      select(1:5) %>%
      mutate(sample = "sample 1")
  )

# write the result once; an existing file is never overwritten
fp <- file.path(fits_path, "political_en_tweets_clustered.rds")
if (!file.exists(fp)) {
  write_rds(clustered_tweets, fp)
}

